Packages
library(DT)
library(adabag)
FALSE Loading required package: rpart
FALSE Loading required package: caret
FALSE Loading required package: ggplot2
FALSE Loading required package: lattice
FALSE Loading required package: foreach
FALSE Loading required package: doParallel
FALSE Loading required package: iterators
FALSE Loading required package: parallel
library(rpart.plot)
library(pROC)
FALSE Type 'citation("pROC")' for a citation.
FALSE
FALSE Attaching package: 'pROC'
FALSE The following objects are masked from 'package:stats':
FALSE
FALSE cov, smooth, var
library(summarytools)
library(corrplot)
FALSE corrplot 0.92 loaded
library(dplyr)
FALSE
FALSE Attaching package: 'dplyr'
FALSE The following objects are masked from 'package:stats':
FALSE
FALSE filter, lag
FALSE The following objects are masked from 'package:base':
FALSE
FALSE intersect, setdiff, setequal, union
library(GGally)
FALSE Registered S3 method overwritten by 'GGally':
FALSE method from
FALSE +.gg ggplot2
library(fastDummies)
library(ggcorrplot)
library(klaR)
FALSE Loading required package: MASS
FALSE
FALSE Attaching package: 'MASS'
FALSE The following object is masked from 'package:dplyr':
FALSE
FALSE select
library(psych)
FALSE
FALSE Attaching package: 'psych'
FALSE The following objects are masked from 'package:ggplot2':
FALSE
FALSE %+%, alpha
library(MASS)
library(devtools)
FALSE Loading required package: usethis
library(ggplot2)
library(ggthemes)
library(GGally)
library(caret)
library(splitTools)
library(rpart)
library(xgboost)
FALSE
FALSE Attaching package: 'xgboost'
FALSE The following object is masked from 'package:dplyr':
FALSE
FALSE slice
library(caTools)
library(dplyr)
library(caret)
library(naniar)
#' Draw a two-panel summary of a binary confusion matrix.
#'
#' The upper panel shows the four cell counts of `cm$table` in a 2x2 grid
#' labelled "Actual" (columns) and "Predicted" (rows); the lower panel prints
#' selected entries of `cm$byClass` and the first two entries of `cm$overall`
#' together with their names.
#'
#' @param cm A list-like object with elements `table` (2x2), `byClass` (named
#'   numeric, at least 7 entries) and `overall` (named numeric, at least 2
#'   entries). Presumably the result of `caret::confusionMatrix()` for a
#'   two-class problem, with predictions in rows and the reference in columns
#'   -- TODO confirm against callers.
#' @return `NULL`, invisibly; called for its plotting side effect.
CM_Function <- function(cm) {
  # Two stacked panels: the matrix gets two thirds of the height.
  layout(matrix(c(1, 1, 2)))
  old_par <- par(mar = c(2, 2, 2, 2))
  # Restore margins and the single-panel layout so that plots drawn after
  # this function are not squeezed into the 3-row layout.
  on.exit({
    par(old_par)
    layout(1)
  }, add = TRUE)

  plot(c(100, 345), c(300, 450), type = "n", xlab = "", ylab = "",
       xaxt = "n", yaxt = "n")
  title("CONFUSION MATRIX", cex.main = 2)

  # create the matrix (diagonal cells share one colour, off-diagonal the other)
  dark <- "#2F4F4E"
  teal <- "#0D8387"
  rect(150, 430, 240, 370, col = dark)
  rect(250, 430, 340, 370, col = teal)
  rect(150, 305, 240, 365, col = teal)
  rect(250, 305, 340, 365, col = dark)
  text(c(195, 295), 435, c("No", "Yes"), cex = 1.2)
  text(140, c(400, 335), c("No", "Yes"), cex = 1.2, srt = 90)
  text(125, 370, "Predicted", cex = 1.3, srt = 90, font = 2)
  text(245, 450, "Actual", cex = 1.3, font = 2)

  # add in the cm results; as.numeric() flattens the table column by column,
  # so entries 1-2 fill the left column and entries 3-4 the right column.
  res <- as.numeric(cm$table)
  text(c(195, 195, 295, 295), c(400, 335, 400, 335), res[1:4],
       cex = 1.6, font = 2, col = "white")

  # add in the specifics: entries 1, 2, 5, 6 and 7 of byClass
  plot(c(100, 0), c(100, 0), type = "n", xlab = "", ylab = "",
       main = "DETAILS", xaxt = "n", yaxt = "n")
  stat_idx <- c(1, 2, 5, 6, 7)
  stat_x <- c(10, 30, 50, 70, 90)
  text(stat_x, 85, names(cm$byClass[stat_idx]), cex = 1.2, font = 2)
  text(stat_x, 70, round(as.numeric(cm$byClass[stat_idx]), 3), cex = 1.2)

  # add in the accuracy information: first two entries of overall
  text(c(30, 70), 35, names(cm$overall[1:2]), cex = 1.5, font = 2)
  text(c(30, 70), 20, round(as.numeric(cm$overall[1:2]), 3), cex = 1.4)

  invisible(NULL)
}
DATA AND QUICK FACTORING
# Read the churn workbook and recode the three yes/no columns as factors
# with levels "0" (no) and "1" (yes).
df <- readxl::read_xls("Cchurn.xls")
yes_no_cols <- c("international_plan", "voice_mail_plan", "churn")
for (col_name in yes_no_cols) {
  df[[col_name]] <- factor(
    df[[col_name]],
    levels = c("no", "yes"),
    labels = c("0", "1")
  )
}
SUMMARY
# Render the per-column summary table of df.
df |>
  summarytools::dfSummary() |>
  print(method = "render")
- We have no missing values -> perfect
- Heavily uneven counts of dependent variable (86 % no / 14 % yes)
-> maybe sample for equality / maybe not because we lose information
of other data
- Independent variables are on different scales -> standardize
- two (maybe three) categorical predictors: International plan /
voice_mail_plan (/ maybe number_customer_service_calls) -> dummy
encode -> not necessary as already 0 and 1
- Rest of data is numeric and most of the variables look normally
distributed, with the exception of number_vmail_messages and total_intl_calls
- transform these value to make them normal?
- maybe make parts of them categorical? (receiving voice mail or not,
calling internationally or not)
- or maybe the categorical values that we have already give an
indication for this
- Test normality of variables
- Can variables be combined? We have day / eve / night / intl calls
and for each of them minutes / calls / charge. Maybe we can combine this
into one metric. Maybe average cost per minute or average cost per
call?
CORRELATION PLOT BEFORE DATA ENGINEERING
# Keep the numeric columns only, then plot their correlation matrix
# together with the matching matrix of correlation p-values.
df_numeric <- df[vapply(df, is.numeric, logical(1))]
M <- cor(df_numeric)
p.mat <- cor_pmat(df_numeric)
ggcorrplot(
  M,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  p.mat = p.mat,
  sig.level = 0.05,
  lab_size = 2,
  tl.cex = 10,
  outline.col = "white",
  ggtheme = ggplot2::theme_minimal(),
  colors = c("#2F4F4E", "white", "#0D8387")
)

Proves theory from before -> we can make one metric out of charge
and minutes –> charge / minutes
DATA ENGINEERING
# For each period add charge / minutes (0 where no minutes were used),
# then drop the raw charge and minutes columns the ratio was built from.
for (period in c("day", "eve", "night", "intl")) {
  minutes <- df[[paste0("total_", period, "_minutes")]]
  charge <- df[[paste0("total_", period, "_charge")]]
  df[[paste0("total_", period, "_charge_per_minute")]] <-
    ifelse(minutes == 0, 0, charge / minutes)
}
raw_cols <- c(
  "total_day_charge", "total_day_minutes",
  "total_eve_charge", "total_eve_minutes",
  "total_night_charge", "total_night_minutes",
  "total_intl_charge", "total_intl_minutes"
)
df <- df[, !(names(df) %in% raw_cols)]
CORRELATION PLOT AFTER DATA ENGINEERING
# Same correlation plot as before, recomputed on the engineered columns.
df_numeric <- Filter(is.numeric, df)
M <- cor(df_numeric)
p.mat <- cor_pmat(df_numeric)
ggcorrplot(
  M,
  hc.order = TRUE,
  type = "lower",
  lab = TRUE,
  p.mat = p.mat,
  sig.level = 0.05,
  lab_size = 2,
  tl.cex = 10,
  outline.col = "white",
  ggtheme = ggplot2::theme_minimal(),
  colors = c("#2F4F4E", "white", "#0D8387")
)

Now we have non-correlated data
HIGHER ORDER FEATURES
Only squaring as we have no negative data. Cubing would be needed
with negative data.
# squared
# Square the numeric columns only. Squaring the whole data frame also hits
# the factor columns, which yields NA columns plus a warning per factor and
# then needs them removed again by hard-coded position.
numeric_cols <- vapply(df, is.numeric, logical(1))
df2 <- df[numeric_cols]^2
colnames(df2) <- paste0(colnames(df2), "_sqd")
df <- cbind(df, df2)
Relationship between data in higher order
# theme_set(theme_minimal())
#
# ggpairs(
# data = df,
# columns = c(1:9, 11:25),
# mapping = aes(col = churn, alpha = .9)
# ) +
# scale_fill_colorblind() +
# scale_color_colorblind()
SAMPLING METHODS
As we have unbalanced data we need to use a sampling method to
balance the classes. Four different methods are available: OVER /
UNDER / BOTH / ROSE.
library(ROSE)
FALSE Loaded ROSE 0.0-4
# OVER: over-sample the minority class. `seed` pins the random draw so the
# balanced sample is reproducible, matching the ROSE() call which also
# passes seed = 1.
df_OVER <- ovun.sample(churn ~ ., data = df, method = "over", seed = 1)$data
table(df$churn)
FALSE
FALSE 0 1
FALSE 4293 707
# Class counts of churn after over-sampling.
table(df_OVER$churn)
FALSE
FALSE 0 1
FALSE 4293 4348
# UNDER: under-sample the majority class. `seed` pins the random draw so the
# balanced sample is reproducible, matching the ROSE() call which also
# passes seed = 1.
df_UNDER <- ovun.sample(churn ~ ., data = df, method = "under", seed = 1)$data
table(df$churn)
FALSE
FALSE 0 1
FALSE 4293 707
# Class counts of churn after under-sampling.
table(df_UNDER$churn)
FALSE
FALSE 0 1
FALSE 687 707
# BOTH: combine over- and under-sampling. `seed` pins the random draw so the
# balanced sample is reproducible, matching the ROSE() call which also
# passes seed = 1.
df_BOTH <- ovun.sample(churn ~ ., data = df, method = "both", seed = 1)$data
table(df$churn)
FALSE
FALSE 0 1
FALSE 4293 707
# Class counts of churn after combined over-/under-sampling.
table(df_BOTH$churn)
FALSE
FALSE 0 1
FALSE 2509 2491
# ROSE: build a balanced data set with ROSE(), seeded, with a target
# minority share of p = 0.5.
df_ROSE <- ROSE(
  formula = churn ~ .,
  data = df,
  seed = 1,
  p = 0.5
)[["data"]]
SAMPLING POST VISUALIZATION
# theme_set(theme_minimal())
#
# ggpairs(
# data = df_ROSE,
# columns = c(1:9, 11:25),
# mapping = aes(col = churn, alpha = .9)
# ) +
# scale_fill_colorblind() +
# scale_color_colorblind() +
# labs(title = "Machine Learning Project")
#
# ggpairs(
# data = df_OVER,
# columns = c(1:9, 11:25),
# mapping = aes(col = churn, alpha = .9)
# ) +
# scale_fill_colorblind() +
# scale_color_colorblind() +
# labs(title = "Machine Learning Project")
# ggpairs(
# data = df_UNDER,
# columns = c(1:9, 11:25),
# mapping = aes(col = churn, alpha = .9)
# ) +
# scale_fill_colorblind() +
# scale_color_colorblind() +
# labs(title = "Machine Learning Project")
# ggpairs(
# data = df_BOTH,
# columns = c(1:9, 11:25),
# mapping = aes(col = churn, alpha = .9)
# ) +
# scale_fill_colorblind() +
# scale_color_colorblind() +
# labs(title = "Machine Learning Project")
TRAIN AND TEST SPLIT
As we need to test the models we need to split the sampled data.
set.seed(1)
# Split the ORIGINAL data first and re-balance the training rows only.
# Splitting an already over-sampled data set puts copies of the same minority
# rows into both train and test, which leaks information and inflates every
# test metric. The test set keeps the real class distribution.
inds <- splitTools::partition(df$churn, p = c(train = 0.7, test = 0.3))
# choose how to balance the training data: method = "over" / "under" / "both"
data <- ovun.sample(
  churn ~ .,
  data = df[inds$train, ],
  method = "over",
  seed = 1
)$data
dftrain <- data
dftest <- df[inds$test, ]
SCALING
As some methods need scaled data we scale the data here to be
centered.
# Learn centring/scaling parameters on the training data only and apply the
# same transformation to both the training and the test set.
norm.value <- dftrain |>
  preProcess(method = c("center", "scale"))
dftrain <- norm.value |>
  predict(dftrain)
dftest <- norm.value |>
  predict(dftest)
PREDICTIVE MODELS
NEURAL NET
# dftrain <- dftrain |>
# mutate_if(is.factor, as.character) |>
# mutate_if(is.character, as.numeric)
#
# library(neuralnet)
# mod.neural <- neuralnet(churn ~ ., data = dftrain, hidden=c(15,15), linear.output = FALSE)
#
# predicted.neural <- predict(mod.neural, dftest[,-c(10)])
#
# confmat.neural <- confusionMatrix(data=predicted.neural, reference = dftest$churn, positive = '1')
#
# CM_Function(confmat.neural)
#
# roc_score.neural =roc(factor(dftest$churn, ordered=TRUE), factor(predicted.neural, ordered=TRUE))
# plot(roc_score.neural ,main ="ROC curve")
BOOSTING
set.seed(123)
# train boosted model (boosting() here is adabag's)
mod.boost <- boosting(churn ~ ., data = dftrain)
# predict once and reuse: $class holds the labels, $prob the class probabilities
pred.boost <- predict(mod.boost, dftest)
# pin the levels so confusionMatrix() still works if only one class is predicted
predicted.boost <- factor(pred.boost$class, levels = levels(dftest$churn))
confmat.boost <- confusionMatrix(
  data = predicted.boost,
  reference = dftest$churn,
  positive = "1"
)
CM_Function(confmat.boost)

# Build the ROC from the predicted probability of class "1" rather than from
# the hard 0/1 labels, which would give a single-threshold curve.
# NOTE(review): column 2 of $prob is assumed to be level "1" -- confirm.
roc_score.boost <- roc(
  response = dftest$churn,
  predictor = pred.boost$prob[, 2],
  levels = c("0", "1"),
  direction = "<"
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve of the boosted model
plot(roc_score.boost, main = "ROC curve")

CTREE
# rpart's cross-validated error (xerror) is random; seed it so the pruning
# choice is reproducible when this chunk is run on its own.
set.seed(123)
tree_full <- rpart(
  churn ~ .,
  data = dftrain,
  method = "class", # "class" because Y is a binary factor
  minbucket = 1,
  cp = 0.00001
)
# Plot tree
rpart.plot(tree_full, yesno = TRUE, digits = -6)

# select the row with minimum cross-validation error and use its CP value
# to get the "best pruned" tree
min_xerr <- which.min(tree_full$cptable[, "xerror"])
cp_bp <- tree_full$cptable[min_xerr, "CP"]
mod.pruned_tree <- prune(tree_full, cp = cp_bp)
rpart.plot(mod.pruned_tree, yesno = TRUE, digits = -3)

# drop the response by name instead of by column position
tree_test_x <- dftest[, names(dftest) != "churn"]
predicted.pruned_tree <- predict(mod.pruned_tree, tree_test_x, type = "class")
prob.pruned_tree <- predict(mod.pruned_tree, tree_test_x, type = "prob")[, "1"]
confmat.prunned_tree <- confusionMatrix(
  data = predicted.pruned_tree,
  reference = dftest$churn,
  positive = "1"
)
CM_Function(confmat.prunned_tree)

# ROC from the predicted probability of class "1", not from hard labels
roc_score.prunned_tree <- roc(
  response = dftest$churn,
  predictor = prob.pruned_tree,
  levels = c("0", "1"),
  direction = "<"
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve of the pruned tree
plot(roc_score.prunned_tree, main = "ROC curve")

BAGGING
# Fix the RNG state before the bagging fit below.
set.seed(123)
# Attaching ipred masks adabag::bagging (see the attach message), so a bare
# bagging() call after this line resolves to ipred's.
library(ipred)
FALSE
FALSE Attaching package: 'ipred'
FALSE The following object is masked from 'package:adabag':
FALSE
FALSE bagging
# train bagged model
# Call ipred's bagging() explicitly: adabag exports a bagging() too, and which
# one a bare call reaches depends on package attach order.
# NOTE(review): the name `ames_bag1` looks like a leftover; kept because the
# prediction code further down reads it.
ames_bag1 <- ipred::bagging(
  formula = churn ~ .,
  data = dftrain,
  nbagg = 100,
  coob = TRUE, # also compute the out-of-bag error estimate
  control = rpart.control(minsplit = 2, cp = 0)
)
ames_bag1
FALSE
FALSE Bagging classification trees with 100 bootstrap replications
FALSE
FALSE Call: bagging.data.frame(formula = churn ~ ., data = dftrain, nbagg = 100,
FALSE coob = TRUE, control = rpart.control(minsplit = 2, cp = 0))
FALSE
FALSE Out-of-bag estimate of misclassification error: 0.0458
# predicted probability of the second class ("1") from the bagged trees
prob.bag <- predict(
  ames_bag1,
  dftest[, names(dftest) != "churn"],
  type = "prob"
)[, 2]
# threshold at 0.5; pin the levels so a one-class prediction cannot break
# confusionMatrix()
predicted <- factor(ifelse(prob.bag >= 0.5, "1", "0"), levels = c("0", "1"))
CM_Function(confusionMatrix(data = predicted, reference = dftest$churn, positive = "1"))

# ROC from the probabilities rather than the thresholded labels
roc_score <- roc(
  response = dftest$churn,
  predictor = prob.bag,
  levels = c("0", "1"),
  direction = "<"
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve of the bagged model
plot(roc_score, main = "ROC curve")

KNN
set.seed(1)
# Result table for k = 1..30. Kept in its own object so the data set `df`
# is not overwritten.
knn_results <- data.frame(
  k = seq(1, 30, 1),
  accuracy = rep(0, 30),
  sensitivity = rep(0, 30)
)
# predictors without the response, selected by name instead of position
knn_train_x <- dftrain[, names(dftrain) != "churn"]
knn_test_x <- dftest[, names(dftest) != "churn"]
# NOTE(review): k is compared on the test set, so the accuracy of the chosen
# k is optimistic; a separate validation split would avoid this.
# iterating over different ks
for (i in seq_len(nrow(knn_results))) {
  # nearest neighbor
  knn_fit <- knn3(y = dftrain$churn, x = knn_train_x, k = knn_results$k[i])
  # predictions response
  knn_pred <- predict(knn_fit, knn_test_x, type = "class")
  # confusion matrix, computed once and read twice
  knn_cm <- confusionMatrix(knn_pred, dftest$churn, positive = "1")
  knn_results$sensitivity[i] <- knn_cm$byClass[["Sensitivity"]]
  knn_results$accuracy[i] <- knn_cm$overall[["Accuracy"]]
}
# plot the k's (the y-axis title comes from scale_y_continuous)
ggplot(knn_results, aes(x = k)) +
  geom_line(aes(y = sensitivity, colour = "Sensitivity")) +
  geom_line(aes(y = accuracy, colour = "Accuracy")) +
  labs(
    x = "Number of k nearest neighbours",
    title = "Accuracy / Sensitivity regarding k"
  ) +
  theme_minimal() +
  scale_y_continuous(name = "Sensitivity / Accuracy", limits = c(0.7, 1)) +
  scale_color_manual(
    name = "Values",
    values = c("Sensitivity" = "darkblue", "Accuracy" = "red")
  ) +
  xlim(1, 30)

# final KNN model with k = 2; predictors selected by name, not position
knn_final_train_x <- dftrain[, names(dftrain) != "churn"]
knn_final_test_x <- dftest[, names(dftest) != "churn"]
mod.knn <- knn3(y = dftrain$churn, x = knn_final_train_x, k = 2)
predicted.knn <- predict(mod.knn, knn_final_test_x, type = "class")
# share of neighbours voting for the second class ("1")
prob.knn <- predict(mod.knn, knn_final_test_x, type = "prob")[, 2]
confmat.knn <- confusionMatrix(
  data = predicted.knn,
  reference = dftest$churn,
  positive = "1"
)
CM_Function(confmat.knn)

# ROC from the class-"1" vote share rather than from hard labels
roc_score.knn <- roc(
  response = dftest$churn,
  predictor = prob.knn,
  levels = c("0", "1"),
  direction = "<"
)
# The plot call that follows reads `roc_score.qda`; keep that name as an
# alias so it still draws the KNN curve.
roc_score.qda <- roc_score.knn
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve of the KNN model (at this point roc_score.qda holds the KNN ROC)
plot(roc_score.qda, main = "ROC curve")

QDA
# quadratic discriminant analysis on all predictors
mod.qda <- qda(churn ~ ., data = dftrain)
# predict once: $class holds the labels, $posterior the class probabilities
pred.qda <- predict(mod.qda, dftest[, names(dftest) != "churn"])
predicted.qda <- pred.qda$class
confmat.qda <- confusionMatrix(
  data = predicted.qda,
  reference = dftest$churn,
  positive = "1"
)
CM_Function(confmat.qda)

# ROC from the posterior probability of class "1", not from hard labels
roc_score.qda <- roc(
  response = dftest$churn,
  predictor = pred.qda$posterior[, "1"],
  levels = c("0", "1"),
  direction = "<"
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve of the QDA model
plot(roc_score.qda, main = "ROC curve")

QLOG
# Fit a binomial GLM with probit link on all predictors, then let step()
# search for a reduced model (its trace is printed below).
mod.log <- glm(
  formula = churn ~ .,
  family = binomial(link = "probit"),
  data = dftrain
)
s <- step(object = mod.log)
FALSE Start: AIC=6930.98
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE number_vmail_messages + total_day_calls + total_eve_calls +
FALSE total_night_calls + total_intl_calls + number_customer_service_calls +
FALSE total_day_charge_per_minute + total_eve_charge_per_minute +
FALSE total_night_charge_per_minute + total_intl_charge_per_minute +
FALSE account_length_sqd + number_vmail_messages_sqd + total_day_calls_sqd +
FALSE total_eve_calls_sqd + total_night_calls_sqd + total_intl_calls_sqd +
FALSE number_customer_service_calls_sqd + total_day_charge_per_minute_sqd +
FALSE total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd +
FALSE total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - number_vmail_messages 1 6880.8 6928.8
FALSE - total_night_calls_sqd 1 6881.0 6929.0
FALSE - account_length_sqd 1 6881.0 6929.0
FALSE - total_night_calls 1 6881.0 6929.0
FALSE - total_day_charge_per_minute 1 6881.0 6929.0
FALSE - total_day_charge_per_minute_sqd 1 6881.0 6929.0
FALSE - total_intl_charge_per_minute 1 6881.1 6929.1
FALSE - total_day_calls 1 6881.1 6929.1
FALSE - total_eve_calls 1 6881.1 6929.1
FALSE - total_intl_charge_per_minute_sqd 1 6881.2 6929.2
FALSE - total_eve_calls_sqd 1 6881.3 6929.3
FALSE - account_length 1 6881.7 6929.7
FALSE - total_day_calls_sqd 1 6881.8 6929.8
FALSE <none> 6881.0 6931.0
FALSE - total_night_charge_per_minute_sqd 1 6883.1 6931.1
FALSE - total_night_charge_per_minute 1 6883.2 6931.2
FALSE - number_vmail_messages_sqd 1 6883.5 6931.5
FALSE - number_customer_service_calls 1 6892.1 6940.1
FALSE - total_intl_calls_sqd 1 6894.7 6942.7
FALSE - voice_mail_plan 1 6897.9 6945.9
FALSE - total_intl_calls 1 6902.7 6950.7
FALSE - total_eve_charge_per_minute 1 6917.5 6965.5
FALSE - total_eve_charge_per_minute_sqd 1 6917.5 6965.5
FALSE - number_customer_service_calls_sqd 1 7027.3 7075.3
FALSE - international_plan 1 7471.8 7519.8
FALSE
FALSE Step: AIC=6928.85
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE total_day_calls + total_eve_calls + total_night_calls + total_intl_calls +
FALSE number_customer_service_calls + total_day_charge_per_minute +
FALSE total_eve_charge_per_minute + total_night_charge_per_minute +
FALSE total_intl_charge_per_minute + account_length_sqd + number_vmail_messages_sqd +
FALSE total_day_calls_sqd + total_eve_calls_sqd + total_night_calls_sqd +
FALSE total_intl_calls_sqd + number_customer_service_calls_sqd +
FALSE total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd +
FALSE total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - total_night_calls_sqd 1 6880.9 6926.9
FALSE - total_night_calls 1 6880.9 6926.9
FALSE - account_length_sqd 1 6880.9 6926.9
FALSE - total_intl_charge_per_minute 1 6880.9 6926.9
FALSE - total_intl_charge_per_minute_sqd 1 6880.9 6926.9
FALSE - total_day_calls 1 6881.0 6927.0
FALSE - total_eve_calls 1 6881.0 6927.0
FALSE - total_eve_calls_sqd 1 6881.2 6927.2
FALSE - account_length 1 6881.5 6927.5
FALSE - total_day_calls_sqd 1 6881.7 6927.7
FALSE <none> 6880.8 6928.8
FALSE - total_day_charge_per_minute 1 6886.7 6932.7
FALSE - total_day_charge_per_minute_sqd 1 6886.7 6932.7
FALSE - total_night_charge_per_minute_sqd 1 6888.9 6934.9
FALSE - total_night_charge_per_minute 1 6889.0 6935.0
FALSE - number_customer_service_calls 1 6897.5 6943.5
FALSE - total_intl_calls_sqd 1 6900.1 6946.1
FALSE - number_vmail_messages_sqd 1 6902.0 6948.0
FALSE - total_intl_calls 1 6907.8 6953.8
FALSE - total_eve_charge_per_minute 1 6923.4 6969.4
FALSE - total_eve_charge_per_minute_sqd 1 6923.4 6969.4
FALSE - voice_mail_plan 1 7023.4 7069.4
FALSE - number_customer_service_calls_sqd 1 7032.6 7078.6
FALSE - international_plan 1 7477.7 7523.7
FALSE
FALSE Step: AIC=6926.85
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE total_day_calls + total_eve_calls + total_night_calls + total_intl_calls +
FALSE number_customer_service_calls + total_day_charge_per_minute +
FALSE total_eve_charge_per_minute + total_night_charge_per_minute +
FALSE total_intl_charge_per_minute + account_length_sqd + number_vmail_messages_sqd +
FALSE total_day_calls_sqd + total_eve_calls_sqd + total_intl_calls_sqd +
FALSE number_customer_service_calls_sqd + total_day_charge_per_minute_sqd +
FALSE total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd +
FALSE total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - account_length_sqd 1 6880.9 6924.9
FALSE - total_intl_charge_per_minute 1 6880.9 6924.9
FALSE - total_intl_charge_per_minute_sqd 1 6880.9 6924.9
FALSE - total_day_calls 1 6881.0 6925.0
FALSE - total_eve_calls 1 6881.0 6925.0
FALSE - total_night_calls 1 6881.1 6925.1
FALSE - total_eve_calls_sqd 1 6881.2 6925.2
FALSE - account_length 1 6881.5 6925.5
FALSE - total_day_calls_sqd 1 6881.7 6925.7
FALSE <none> 6880.9 6926.9
FALSE - total_day_charge_per_minute 1 6886.7 6930.7
FALSE - total_day_charge_per_minute_sqd 1 6886.7 6930.7
FALSE - total_night_charge_per_minute_sqd 1 6888.9 6932.9
FALSE - total_night_charge_per_minute 1 6889.0 6933.0
FALSE - number_customer_service_calls 1 6897.5 6941.5
FALSE - total_intl_calls_sqd 1 6900.1 6944.1
FALSE - number_vmail_messages_sqd 1 6902.0 6946.0
FALSE - total_intl_calls 1 6907.8 6951.8
FALSE - total_eve_charge_per_minute 1 6923.5 6967.5
FALSE - total_eve_charge_per_minute_sqd 1 6923.5 6967.5
FALSE - voice_mail_plan 1 7023.5 7067.5
FALSE - number_customer_service_calls_sqd 1 7032.6 7076.6
FALSE - international_plan 1 7477.7 7521.7
FALSE
FALSE Step: AIC=6924.87
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE total_day_calls + total_eve_calls + total_night_calls + total_intl_calls +
FALSE number_customer_service_calls + total_day_charge_per_minute +
FALSE total_eve_charge_per_minute + total_night_charge_per_minute +
FALSE total_intl_charge_per_minute + number_vmail_messages_sqd +
FALSE total_day_calls_sqd + total_eve_calls_sqd + total_intl_calls_sqd +
FALSE number_customer_service_calls_sqd + total_day_charge_per_minute_sqd +
FALSE total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd +
FALSE total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - total_intl_charge_per_minute 1 6880.9 6922.9
FALSE - total_intl_charge_per_minute_sqd 1 6881.0 6923.0
FALSE - total_day_calls 1 6881.0 6923.0
FALSE - total_eve_calls 1 6881.0 6923.0
FALSE - total_night_calls 1 6881.1 6923.1
FALSE - total_eve_calls_sqd 1 6881.2 6923.2
FALSE - total_day_calls_sqd 1 6881.8 6923.8
FALSE <none> 6880.9 6924.9
FALSE - total_day_charge_per_minute 1 6886.7 6928.7
FALSE - total_day_charge_per_minute_sqd 1 6886.8 6928.8
FALSE - total_night_charge_per_minute_sqd 1 6889.0 6931.0
FALSE - total_night_charge_per_minute 1 6889.1 6931.1
FALSE - account_length 1 6895.3 6937.3
FALSE - number_customer_service_calls 1 6897.5 6939.5
FALSE - total_intl_calls_sqd 1 6900.1 6942.1
FALSE - number_vmail_messages_sqd 1 6902.0 6944.0
FALSE - total_intl_calls 1 6907.8 6949.8
FALSE - total_eve_charge_per_minute 1 6923.5 6965.5
FALSE - total_eve_charge_per_minute_sqd 1 6923.5 6965.5
FALSE - voice_mail_plan 1 7023.5 7065.5
FALSE - number_customer_service_calls_sqd 1 7032.7 7074.7
FALSE - international_plan 1 7478.1 7520.1
FALSE
FALSE Step: AIC=6922.91
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE total_day_calls + total_eve_calls + total_night_calls + total_intl_calls +
FALSE number_customer_service_calls + total_day_charge_per_minute +
FALSE total_eve_charge_per_minute + total_night_charge_per_minute +
FALSE number_vmail_messages_sqd + total_day_calls_sqd + total_eve_calls_sqd +
FALSE total_intl_calls_sqd + number_customer_service_calls_sqd +
FALSE total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd +
FALSE total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - total_day_calls 1 6881.0 6921.0
FALSE - total_eve_calls 1 6881.1 6921.1
FALSE - total_night_calls 1 6881.2 6921.2
FALSE - total_eve_calls_sqd 1 6881.2 6921.2
FALSE - total_day_calls_sqd 1 6881.8 6921.8
FALSE <none> 6880.9 6922.9
FALSE - total_day_charge_per_minute 1 6886.8 6926.8
FALSE - total_day_charge_per_minute_sqd 1 6886.8 6926.8
FALSE - total_night_charge_per_minute_sqd 1 6889.0 6929.0
FALSE - total_night_charge_per_minute 1 6889.1 6929.1
FALSE - account_length 1 6895.4 6935.4
FALSE - total_intl_charge_per_minute_sqd 1 6895.5 6935.5
FALSE - number_customer_service_calls 1 6897.6 6937.6
FALSE - total_intl_calls_sqd 1 6900.1 6940.1
FALSE - number_vmail_messages_sqd 1 6902.1 6942.1
FALSE - total_intl_calls 1 6907.8 6947.8
FALSE - total_eve_charge_per_minute 1 6923.5 6963.5
FALSE - total_eve_charge_per_minute_sqd 1 6923.5 6963.5
FALSE - voice_mail_plan 1 7023.6 7063.6
FALSE - number_customer_service_calls_sqd 1 7032.7 7072.7
FALSE - international_plan 1 7478.4 7518.4
FALSE
FALSE Step: AIC=6921.01
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE total_eve_calls + total_night_calls + total_intl_calls +
FALSE number_customer_service_calls + total_day_charge_per_minute +
FALSE total_eve_charge_per_minute + total_night_charge_per_minute +
FALSE number_vmail_messages_sqd + total_day_calls_sqd + total_eve_calls_sqd +
FALSE total_intl_calls_sqd + number_customer_service_calls_sqd +
FALSE total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd +
FALSE total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - total_eve_calls 1 6881.2 6919.2
FALSE - total_night_calls 1 6881.3 6919.3
FALSE - total_eve_calls_sqd 1 6881.3 6919.3
FALSE <none> 6881.0 6921.0
FALSE - total_day_charge_per_minute 1 6887.0 6925.0
FALSE - total_day_charge_per_minute_sqd 1 6887.0 6925.0
FALSE - total_night_charge_per_minute_sqd 1 6889.1 6927.1
FALSE - total_night_charge_per_minute 1 6889.3 6927.3
FALSE - account_length 1 6895.5 6933.5
FALSE - total_intl_charge_per_minute_sqd 1 6895.7 6933.7
FALSE - number_customer_service_calls 1 6897.6 6935.6
FALSE - total_intl_calls_sqd 1 6900.1 6938.1
FALSE - number_vmail_messages_sqd 1 6902.2 6940.2
FALSE - total_day_calls_sqd 1 6905.9 6943.9
FALSE - total_intl_calls 1 6907.8 6945.8
FALSE - total_eve_charge_per_minute 1 6923.6 6961.6
FALSE - total_eve_charge_per_minute_sqd 1 6923.6 6961.6
FALSE - voice_mail_plan 1 7023.7 7061.7
FALSE - number_customer_service_calls_sqd 1 7032.7 7070.7
FALSE - international_plan 1 7479.1 7517.1
FALSE
FALSE Step: AIC=6919.17
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE total_night_calls + total_intl_calls + number_customer_service_calls +
FALSE total_day_charge_per_minute + total_eve_charge_per_minute +
FALSE total_night_charge_per_minute + number_vmail_messages_sqd +
FALSE total_day_calls_sqd + total_eve_calls_sqd + total_intl_calls_sqd +
FALSE number_customer_service_calls_sqd + total_day_charge_per_minute_sqd +
FALSE total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd +
FALSE total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - total_night_calls 1 6881.4 6917.4
FALSE - total_eve_calls_sqd 1 6883.0 6919.0
FALSE <none> 6881.2 6919.2
FALSE - total_day_charge_per_minute 1 6887.1 6923.1
FALSE - total_day_charge_per_minute_sqd 1 6887.1 6923.1
FALSE - total_night_charge_per_minute_sqd 1 6889.3 6925.3
FALSE - total_night_charge_per_minute 1 6889.4 6925.4
FALSE - account_length 1 6895.7 6931.7
FALSE - total_intl_charge_per_minute_sqd 1 6895.8 6931.8
FALSE - number_customer_service_calls 1 6897.8 6933.8
FALSE - total_intl_calls_sqd 1 6900.2 6936.2
FALSE - number_vmail_messages_sqd 1 6902.5 6938.5
FALSE - total_day_calls_sqd 1 6906.1 6942.1
FALSE - total_intl_calls 1 6907.9 6943.9
FALSE - total_eve_charge_per_minute 1 6923.7 6959.7
FALSE - total_eve_charge_per_minute_sqd 1 6923.7 6959.7
FALSE - voice_mail_plan 1 7024.0 7060.0
FALSE - number_customer_service_calls_sqd 1 7032.9 7068.9
FALSE - international_plan 1 7480.1 7516.1
FALSE
FALSE Step: AIC=6917.41
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE total_intl_calls + number_customer_service_calls + total_day_charge_per_minute +
FALSE total_eve_charge_per_minute + total_night_charge_per_minute +
FALSE number_vmail_messages_sqd + total_day_calls_sqd + total_eve_calls_sqd +
FALSE total_intl_calls_sqd + number_customer_service_calls_sqd +
FALSE total_day_charge_per_minute_sqd + total_eve_charge_per_minute_sqd +
FALSE total_night_charge_per_minute_sqd + total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE - total_eve_calls_sqd 1 6883.2 6917.2
FALSE <none> 6881.4 6917.4
FALSE - total_day_charge_per_minute 1 6887.4 6921.4
FALSE - total_day_charge_per_minute_sqd 1 6887.4 6921.4
FALSE - total_night_charge_per_minute_sqd 1 6889.6 6923.6
FALSE - total_night_charge_per_minute 1 6889.7 6923.7
FALSE - account_length 1 6895.9 6929.9
FALSE - total_intl_charge_per_minute_sqd 1 6896.1 6930.1
FALSE - number_customer_service_calls 1 6897.9 6931.9
FALSE - total_intl_calls_sqd 1 6900.4 6934.4
FALSE - number_vmail_messages_sqd 1 6902.6 6936.6
FALSE - total_day_calls_sqd 1 6906.4 6940.4
FALSE - total_intl_calls 1 6908.1 6942.1
FALSE - total_eve_charge_per_minute 1 6924.1 6958.1
FALSE - total_eve_charge_per_minute_sqd 1 6924.1 6958.1
FALSE - voice_mail_plan 1 7024.0 7058.0
FALSE - number_customer_service_calls_sqd 1 7032.9 7066.9
FALSE - international_plan 1 7480.6 7514.6
FALSE
FALSE Step: AIC=6917.21
FALSE churn ~ account_length + international_plan + voice_mail_plan +
FALSE total_intl_calls + number_customer_service_calls + total_day_charge_per_minute +
FALSE total_eve_charge_per_minute + total_night_charge_per_minute +
FALSE number_vmail_messages_sqd + total_day_calls_sqd + total_intl_calls_sqd +
FALSE number_customer_service_calls_sqd + total_day_charge_per_minute_sqd +
FALSE total_eve_charge_per_minute_sqd + total_night_charge_per_minute_sqd +
FALSE total_intl_charge_per_minute_sqd
FALSE
FALSE Df Deviance AIC
FALSE <none> 6883.2 6917.2
FALSE - total_day_charge_per_minute 1 6889.4 6921.4
FALSE - total_day_charge_per_minute_sqd 1 6889.4 6921.4
FALSE - total_night_charge_per_minute_sqd 1 6891.5 6923.5
FALSE - total_night_charge_per_minute 1 6891.6 6923.6
FALSE - account_length 1 6897.4 6929.4
FALSE - total_intl_charge_per_minute_sqd 1 6898.0 6930.0
FALSE - number_customer_service_calls 1 6899.7 6931.7
FALSE - total_intl_calls_sqd 1 6902.4 6934.4
FALSE - number_vmail_messages_sqd 1 6904.5 6936.5
FALSE - total_day_calls_sqd 1 6908.2 6940.2
FALSE - total_intl_calls 1 6910.1 6942.1
FALSE - total_eve_charge_per_minute 1 6925.7 6957.7
FALSE - total_eve_charge_per_minute_sqd 1 6925.7 6957.7
FALSE - voice_mail_plan 1 7026.1 7058.1
FALSE - number_customer_service_calls_sqd 1 7029.5 7061.5
FALSE - international_plan 1 7479.0 7511.0
# refit the probit model on the formula selected by step()
mod.log <- glm(s$formula, data = dftrain, family = binomial(link = "probit"))
# predicted probability of churn = "1"
prob.log <- predict(
  mod.log,
  dftest[, names(dftest) != "churn"],
  type = "response"
)
# threshold at 0.5; pin the levels so a one-class prediction cannot break
# confusionMatrix()
predicted.log <- factor(ifelse(prob.log > 0.5, "1", "0"), levels = c("0", "1"))
confmat.log <- confusionMatrix(
  data = predicted.log,
  reference = dftest$churn,
  positive = "1"
)
CM_Function(confmat.log)

# ROC from the predicted probabilities rather than the thresholded labels
roc_score.log <- roc(
  response = dftest$churn,
  predictor = prob.log,
  levels = c("0", "1"),
  direction = "<"
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve of the probit model
plot(roc_score.log, main = "ROC curve")

GAUSSIAN SVM
library(e1071)
# The probability model fitted by probability = TRUE uses internal
# cross-validation, so seed it for reproducible probabilities.
set.seed(123)
mod.svm <- svm(
  formula = churn ~ .,
  data = dftrain,
  type = "C-classification", # classification: churn is a two-level factor
  kernel = "radial",
  probability = TRUE # needed to get class probabilities for the ROC
)
pred.svm <- predict(
  mod.svm,
  dftest[, names(dftest) != "churn"],
  probability = TRUE
)
prob.svm <- attr(pred.svm, "probabilities")[, "1"]
# plain class labels, without the attached probability matrix
predicted.svm <- pred.svm
attr(predicted.svm, "probabilities") <- NULL
confmat.svm <- confusionMatrix(
  data = predicted.svm,
  reference = dftest$churn,
  positive = "1"
)
CM_Function(confmat.svm)

# ROC from the class-"1" probability rather than from hard labels
roc_score.svm <- roc(
  response = dftest$churn,
  predictor = prob.svm,
  levels = c("0", "1"),
  direction = "<"
)
FALSE Setting levels: control = 0, case = 1
FALSE Setting direction: controls < cases
# ROC curve of the SVM
plot(roc_score.svm, main = "ROC curve")

PRESENTATION ASSETS
# The former `rm(list = ls())` was dropped: wiping the workspace from inside a
# script also deletes helpers such as CM_Function and anything else the user
# has loaded. Everything this section needs is re-created below.
# READ DATA
df <- readxl::read_xls("Cchurn.xls")
for (plan_col in c("international_plan", "voice_mail_plan")) {
  df[[plan_col]] <- factor(
    df[[plan_col]],
    levels = c("no", "yes"),
    labels = c("0", "1")
  )
}
df$churn <- factor(df$churn, levels = c("no", "yes"), labels = c("No", "Yes"))
# DATA ENGINEERING
# charge / minutes per period (0 where no minutes were used); the raw charge
# and minutes columns are not carried over into the presentation data
for (period in c("day", "eve", "night", "intl")) {
  minutes <- df[[paste0("total_", period, "_minutes")]]
  charge <- df[[paste0("total_", period, "_charge")]]
  df[[paste0("total_", period, "_charge_per_minute")]] <-
    ifelse(minutes == 0, 0, charge / minutes)
}
# Map source column -> display label by NAME, so a changed column order in
# the workbook cannot silently attach a label to the wrong variable. The
# order below is the one the pairs plots index by position.
display_names <- c(
  account_length = "Account Length",
  international_plan = "International Plan",
  voice_mail_plan = "Voice Mail Plan",
  number_vmail_messages = "Voice Mail Messages",
  total_day_calls = "Total Days Calls",
  total_eve_calls = "Total Evening Calls",
  total_night_calls = "Total Night Calls",
  total_intl_calls = "Total Internation Calls",
  number_customer_service_calls = "Total Customer Service Call",
  churn = "Customer Churn",
  total_day_charge_per_minute = "Total Day Charge/Minute",
  total_eve_charge_per_minute = "Total Evening Charge/Minute",
  total_night_charge_per_minute = "Total Night Charge/Minute",
  total_intl_charge_per_minute = "Total International Charge/Minute"
)
stopifnot(all(names(display_names) %in% names(df)))
df <- df[names(display_names)]
colnames(df) <- unname(display_names)
PAIRS PLOT
# Load the necessary libraries
library(ggplot2)
library(GGally)
library(ggthemes)
# Set the main color palette
# Two-colour palette used by the manual colour and fill scales below
colors <- c("#0D8387", "#870D27")
# Pairs plot of columns 1 to 4, coloured by churn
PAIRS1 <- ggpairs(
  df,
  columns = 1:4,
  mapping = aes(col = `Customer Churn`, alpha = 0.9)
) +
  scale_color_manual(values = colors) +
  scale_fill_manual(values = colors) +
  labs(
    title = "Customer Telecommunication Data",
    subtitle = "Customer Churn = Yes is red",
    caption = "From Variable 1 to 4"
  ) +
  theme(plot.title = element_text(face = "bold"))
PAIRS1
FALSE `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
FALSE `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
FALSE `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
FALSE `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Pairs plot of columns 5 to 9, coloured by churn
PAIRS2 <- ggpairs(
  df,
  columns = 5:9,
  mapping = aes(col = `Customer Churn`, alpha = 0.9)
) +
  scale_color_manual(values = colors) +
  scale_fill_manual(values = colors) +
  labs(
    title = "Customer Telecommunication Data",
    subtitle = "Customer Churn = Yes is red",
    caption = "From Variable 5 to 9"
  ) +
  theme(plot.title = element_text(face = "bold"))
PAIRS2

# Column 10 (churn) on its own, annotated with the two class shares
PAIRS3 <- ggpairs(
  df,
  columns = 10L,
  mapping = aes(col = `Customer Churn`, alpha = 0.9)
) +
  scale_color_manual(values = colors) +
  scale_fill_manual(values = colors) +
  labs(
    title = "Customer Telecommunication Data",
    subtitle = "",
    y = "Count"
  ) +
  theme(plot.title = element_text(face = "bold")) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  annotate("text", x = 2, y = 900, label = "14.14%",
           colour = "#870D27", size = 8) +
  annotate("text", x = 1, y = 4500, label = "85.86%",
           colour = "#0D8387", size = 8) +
  theme(axis.text.x = element_text(size = 16))
PAIRS3

# Pairs plot of columns 11 to 14, coloured by churn
PAIRS4 <- ggpairs(
  df,
  columns = 11:14,
  mapping = aes(col = `Customer Churn`, alpha = 0.9)
) +
  scale_color_manual(values = colors) +
  scale_fill_manual(values = colors) +
  labs(
    title = "Customer Telecommunication Data",
    subtitle = "Customer Churn = Yes is red",
    caption = "From Variable 11 to 14"
  ) +
  theme(plot.title = element_text(face = "bold"))
PAIRS4

# Recode churn to numeric 0 (No) / 1 (Yes), then compute the share of each
# class over all rows.
df$`Customer Churn` <- as.numeric(df$`Customer Churn` == "Yes")
Proportions_Churn <- sum(df$`Customer Churn` == 1) / nrow(df)
Proportions_No_Churn <- 1 - Proportions_Churn
Proportion of customers who churned: 14.14%, versus 85.86% who
didn’t churn.